# Copyright (C) 2005 Christian Limpach <Christian.Limpach@cl.cam.ac.uk>
+# Copyright (C) 2005 XenSource Ltd
# This file is subject to the terms and conditions of the GNU General
# Public License. See the file "COPYING" in the main directory of
import xen.lowlevel.xc
-import XendDomainInfo
from xen.xend.xenstore.xsutil import IntroduceDomain
from XendError import XendError
raise XendError(errmsg)
return buf
-def save(xd, fd, dominfo, live):
+def save(fd, dominfo, live):
write_exact(fd, SIGNATURE, "could not write guest state file: signature")
config = sxp.to_string(dominfo.sxpr())
- write_exact(fd, pack("!i", len(config)),
- "could not write guest state file: config len")
- write_exact(fd, config, "could not write guest state file: config")
-
- # xc_save takes three customization parameters: maxit, max_f, and flags
- # the last controls whether or not save is 'live', while the first two
- # further customize behaviour when 'live' save is enabled. Passing "0"
- # simply uses the defaults compiled into libxenguest; see the comments
- # and/or code in xc_linux_save() for more information.
- cmd = [PATH_XC_SAVE, str(xc.handle()), str(fd),
- str(dominfo.getDomid()), "0", "0", str(int(live)) ]
- log.info("[xc_save] " + join(cmd))
- child = xPopen3(cmd, True, -1, [fd, xc.handle()])
+
+ domain_name = dominfo.getName()
+
+ if live:
+ dominfo.setName('migrating-' + domain_name)
+
+ try:
+ write_exact(fd, pack("!i", len(config)),
+ "could not write guest state file: config len")
+ write_exact(fd, config, "could not write guest state file: config")
+
+ # xc_save takes three customization parameters: maxit, max_f, and
+ # flags.  The last controls whether or not the save is 'live', while
+ # the first two further customize behaviour when 'live' save is
+ # enabled.  Passing "0" simply uses the defaults compiled into
+ # libxenguest; see the comments and/or code in xc_linux_save() for
+ # more information.
+ cmd = [PATH_XC_SAVE, str(xc.handle()), str(fd),
+ str(dominfo.getDomid()), "0", "0", str(int(live)) ]
+ log.info("[xc_save] " + join(cmd))
+ child = xPopen3(cmd, True, -1, [fd, xc.handle()])
- lasterr = ""
- p = select.poll()
- p.register(child.fromchild.fileno())
- p.register(child.childerr.fileno())
- while True:
- r = p.poll()
- for (fd, event) in r:
- if not event & select.POLLIN:
- continue
- if fd == child.childerr.fileno():
- l = child.childerr.readline()
- log.error(l.rstrip())
- lasterr = l.rstrip()
- if fd == child.fromchild.fileno():
- l = child.fromchild.readline()
- if l.rstrip() == "suspend":
- log.info("suspending %d" % dominfo.getDomid())
- xd.domain_shutdown(dominfo.getDomid(), reason='suspend')
- dominfo.state_wait(XendDomainInfo.STATE_VM_SUSPENDED)
- log.info("suspend %d done" % dominfo.getDomid())
- child.tochild.write("done\n")
- child.tochild.flush()
- if filter(lambda (fd, event): event & select.POLLHUP, r):
- break
-
- if child.wait() >> 8 == 127:
- lasterr = "popen %s failed" % PATH_XC_SAVE
- if child.wait() != 0:
- raise XendError("xc_save failed: %s" % lasterr)
-
- dominfo.destroy()
- return None
-
-def restore(fd):
+ lasterr = ""
+ p = select.poll()
+ p.register(child.fromchild.fileno())
+ p.register(child.childerr.fileno())
+ while True:
+ r = p.poll()
+ for (fd, event) in r:
+ if not event & select.POLLIN:
+ continue
+ if fd == child.childerr.fileno():
+ l = child.childerr.readline()
+ log.error(l.rstrip())
+ lasterr = l.rstrip()
+ if fd == child.fromchild.fileno():
+ l = child.fromchild.readline()
+ if l.rstrip() == "suspend":
+ log.info("suspending %d", dominfo.getDomid())
+ dominfo.shutdown('suspend')
+ dominfo.waitForShutdown()
+ log.info("suspend %d done", dominfo.getDomid())
+ child.tochild.write("done\n")
+ child.tochild.flush()
+ if filter(lambda (fd, event): event & select.POLLHUP, r):
+ break
+
+ if child.wait() >> 8 == 127:
+ lasterr = "popen %s failed" % PATH_XC_SAVE
+ if child.wait() != 0:
+ raise XendError("xc_save failed: %s" % lasterr)
+
+ dominfo.destroyDomain()
+ except Exception, exn:
+ log.exception("Save failed on domain %s (%d).", domain_name,
+ dominfo.getDomid())
+ try:
+ if live:
+ dominfo.setName(domain_name)
+ except:
+ log.exception("Failed to reset the migrating domain's name")
+ raise Exception, exn
+
+
+def restore(xd, fd):
signature = read_exact(fd, len(SIGNATURE),
"not a valid guest state file: signature read")
if signature != SIGNATURE:
raise XendError("not a valid guest state file: config parse")
vmconfig = p.get_val()
- dominfo = XendDomainInfo.restore(vmconfig)
- l = read_exact(fd, sizeof_unsigned_long,
- "not a valid guest state file: pfn count read")
- nr_pfns = unpack("=L", l)[0] # XXX endianess
- if nr_pfns > 1024*1024: # XXX
- raise XendError(
- "not a valid guest state file: pfn count out of range")
+ dominfo = xd.restore_(vmconfig)
- if dominfo.store_channel:
- store_evtchn = dominfo.store_channel.port2
- else:
- store_evtchn = 0
+ assert dominfo.store_channel
+ assert dominfo.console_channel
+
+ try:
+ l = read_exact(fd, sizeof_unsigned_long,
+ "not a valid guest state file: pfn count read")
+ nr_pfns = unpack("=L", l)[0] # XXX endianess
+ if nr_pfns > 1024*1024: # XXX
+ raise XendError(
+ "not a valid guest state file: pfn count out of range")
- if dominfo.console_channel:
+ store_evtchn = dominfo.store_channel.port2
console_evtchn = dominfo.console_channel.port2
- else:
- console_evtchn = 0
-
- cmd = [PATH_XC_RESTORE, str(xc.handle()), str(fd),
- str(dominfo.getDomid()), str(nr_pfns),
- str(store_evtchn), str(console_evtchn)]
- log.info("[xc_restore] " + join(cmd))
- child = xPopen3(cmd, True, -1, [fd, xc.handle()])
- child.tochild.close()
-
- lasterr = ""
- p = select.poll()
- p.register(child.fromchild.fileno())
- p.register(child.childerr.fileno())
- while True:
- r = p.poll()
- for (fd, event) in r:
- if not event & select.POLLIN:
- continue
- if fd == child.childerr.fileno():
- l = child.childerr.readline()
- log.error(l.rstrip())
- lasterr = l.rstrip()
- if fd == child.fromchild.fileno():
- l = child.fromchild.readline()
- while l:
- log.info(l.rstrip())
- m = re.match(r"^(store-mfn) (\d+)\n$", l)
- if m:
- if dominfo.store_channel:
+
+ cmd = [PATH_XC_RESTORE, str(xc.handle()), str(fd),
+ str(dominfo.getDomid()), str(nr_pfns),
+ str(store_evtchn), str(console_evtchn)]
+ log.info("[xc_restore] " + join(cmd))
+ child = xPopen3(cmd, True, -1, [fd, xc.handle()])
+ child.tochild.close()
+
+ lasterr = ""
+ p = select.poll()
+ p.register(child.fromchild.fileno())
+ p.register(child.childerr.fileno())
+ while True:
+ r = p.poll()
+ for (fd, event) in r:
+ if not event & select.POLLIN:
+ continue
+ if fd == child.childerr.fileno():
+ l = child.childerr.readline()
+ log.error(l.rstrip())
+ lasterr = l.rstrip()
+ if fd == child.fromchild.fileno():
+ l = child.fromchild.readline()
+ while l:
+ log.info(l.rstrip())
+ m = re.match(r"^(store-mfn) (\d+)\n$", l)
+ if m:
store_mfn = int(m.group(2))
dominfo.setStoreRef(store_mfn)
IntroduceDomain(dominfo.getDomid(),
store_mfn,
dominfo.store_channel.port1,
dominfo.getDomainPath())
- m = re.match(r"^(console-mfn) (\d+)\n$", l)
- if m:
- dominfo.setConsoleRef(int(m.group(2)))
- try:
- l = child.fromchild.readline()
- except:
- l = None
- if filter(lambda (fd, event): event & select.POLLHUP, r):
- break
-
- if child.wait() >> 8 == 127:
- lasterr = "popen %s failed" % PATH_XC_RESTORE
- if child.wait() != 0:
- raise XendError("xc_restore failed: %s" % lasterr)
-
- return dominfo
+ m = re.match(r"^(console-mfn) (\d+)\n$", l)
+ if m:
+ dominfo.setConsoleRef(int(m.group(2)))
+ try:
+ l = child.fromchild.readline()
+ except:
+ l = None
+ if filter(lambda (fd, event): event & select.POLLHUP, r):
+ break
+
+ if child.wait() >> 8 == 127:
+ lasterr = "popen %s failed" % PATH_XC_RESTORE
+ if child.wait() != 0:
+ raise XendError("xc_restore failed: %s" % lasterr)
+
+ return dominfo
+ except:
+ log.exception("Restore failed")
+ dominfo.destroy()
+ raise
"""
try:
- fd = os.open(src, os.O_RDONLY)
- dominfo = XendCheckpoint.restore(fd)
- self._add_domain(dominfo)
- return dominfo
+ return self.domain_restore_fd(os.open(src, os.O_RDONLY))
except OSError, ex:
raise XendError("can't read guest state file %s: %s" %
(src, ex[1]))
+ def domain_restore_fd(self, fd):
+     """Restore a domain from the given file descriptor.
+
+     @param fd: descriptor of an open guest state file; it is handed on
+                unchanged to XendCheckpoint.restore.
+     @return: the value returned by XendCheckpoint.restore (the restored
+              domain's info object), so that domain_restore, which
+              returns the result of this method, has something to give
+              back to its own caller.
+     @raise Exception: any failure is logged here and then re-raised
+                       unchanged.
+     """
+
+     try:
+         return XendCheckpoint.restore(self, fd)
+     except Exception:
+         log.exception("Restore failed")
+         raise
+
+
+ def restore_(self, config):
+     """Build the domain for an in-progress restore and register it.
+
+     Only {@link XendCheckpoint} is expected to call this.  A restore
+     request enters XendDomain via {@link #domain_restore} or {@link
+     #domain_restore_fd} and is handed straight to XendCheckpoint, which
+     calls back here once it is ready to create the domain.  Going
+     through this method, rather than calling {@link
+     XendDomainInfo.restore} directly, lets domain creation be serialised
+     under domains_lock without holding that lock across the whole of
+     domain_restore_fd, which would deadlock while waiting for the old
+     domain to die.
+     """
+     self.domains_lock.acquire()
+     try:
+         restored = XendDomainInfo.restore(config)
+         self._add_domain(restored)
+     finally:
+         # Always drop the lock, whether or not creation succeeded.
+         self.domains_lock.release()
+     return restored
+
def domain_lookup(self, id):
self.domains_lock.acquire()
port = xroot.get_xend_relocation_port()
sock = relocate.setupRelocation(dst, port)
- # temporarily rename domain for localhost migration
- if dst == "localhost":
- dominfo.setName("tmp-" + dominfo.getName())
-
- try:
- XendCheckpoint.save(self, sock.fileno(), dominfo, live)
- except:
- if dst == "localhost":
- dominfo.setName(
- string.replace(dominfo.getName(), "tmp-", "", 1))
- raise
+ XendCheckpoint.save(sock.fileno(), dominfo, live)
- return None
def domain_save(self, id, dst):
"""Start saving a domain to file.
fd = os.open(dst, os.O_WRONLY | os.O_CREAT | os.O_TRUNC)
# For now we don't support 'live checkpoint'
- return XendCheckpoint.save(self, fd, dominfo, False)
+ return XendCheckpoint.save(fd, dominfo, False)
except OSError, ex:
raise XendError("can't write guest state file %s: %s" %
STATE_VM_OK = "ok"
STATE_VM_TERMINATED = "terminated"
-STATE_VM_SUSPENDED = "suspended"
"""Flag for a block device backend domain."""
SIF_BLK_BE_DOMAIN = (1<<4)
# The domain no longer exists. This will occur if we have
# scheduled a timer to check for shutdown timeouts and the
# shutdown succeeded. It will also occur if someone
- # destroys a domain beneath us. We clean up, just in
- # case.
+ # destroys a domain beneath us. We clean up the domain,
+ # just in case, but we can't clean up the VM, because that
+ # VM may have migrated to a different domain on this
+ # machine.
self.cleanupDomain()
- self.cleanupVm()
return
if xeninfo['dying']:
# Dying means that a domain has been destroyed, but has not
- # yet been cleaned up by Xen. This could persist indefinitely
- # if, for example, another domain has some of its pages
- # mapped. We might like to diagnose this problem in the
- # future, but for now all we do is make sure that it's not
- # us holding the pages, by calling the cleanup methods.
+ # yet been cleaned up by Xen. This state could persist
+ # indefinitely if, for example, another domain has some of its
+ # pages mapped. We might like to diagnose this problem in the
+ # future, but for now all we do is make sure that it's not us
+ # holding the pages, by calling cleanupDomain. We can't
+ # clean up the VM, as above.
self.cleanupDomain()
- self.cleanupVm()
return
elif xeninfo['crashed']:
restart_reason = 'crash'
elif xeninfo['shutdown']:
- if self.readDom('xend/shutdown'):
+ if self.readDom('xend/shutdown_completed'):
# We've seen this shutdown already, but we are preserving
# the domain for debugging. Leave it alone.
- pass
+ return
+
else:
reason = shutdown_reason(xeninfo['shutdown_reason'])
self.clearRestart()
if reason == 'suspend':
- self.state_set(STATE_VM_SUSPENDED)
+ self.state_set(STATE_VM_TERMINATED)
# Don't destroy the domain. XendCheckpoint will do
# this once it has finished.
elif reason in ['poweroff', 'reboot']:
if not reason in shutdown_reasons.values():
raise XendError('invalid reason:' + reason)
self.storeDom("control/shutdown", reason)
- if not reason == 'suspend':
+ if reason != 'suspend':
self.storeDom('xend/shutdown_start_time', time.time())
"rename-restart" : self.renameRestart}[self.info['on_' + reason]]()
- def preserve(self):
- log.info("Preserving dead domain %s (%d).", self.info['name'],
- self.domid)
-
-
def renameRestart(self):
+ """Handler for the "rename-restart" policy: calls self.restart(True)."""
self.restart(True)
## public:
- def state_wait(self, state):
+ def waitForShutdown(self):
+ """Block the caller until self.state is no longer STATE_VM_OK,
+ sleeping on state_updated between checks."""
self.state_updated.acquire()
- while self.state != state:
+ while self.state == STATE_VM_OK:
self.state_updated.wait()
self.state_updated.release()
"""Cleanup domain resources; release devices. Idempotent. Nothrow
guarantee."""
- self.state_set(STATE_VM_TERMINATED)
self.release_devices()
self.closeStoreChannel()
self.closeConsoleChannel()
log.debug("XendDomainInfo.destroy: domid=%s", str(self.domid))
- self.cleanupDomain()
self.cleanupVm()
+ self.destroyDomain()
+
+
+ def destroyDomain(self):
+ log.debug("XendDomainInfo.destroyDomain(%s)", str(self.domid))
+
+ self.cleanupDomain()
try:
if self.domid is not None:
except Exception:
log.exception("XendDomainInfo.destroy: xc.domain_destroy failed.")
+ self.state_set(STATE_VM_TERMINATED)
+
## private:
try:
if rename:
- self.preserveShutdownDomain()
+ self.preserveForRestart()
else:
- self.cleanupDomain()
self.destroy()
try:
xd = get_component('xen.xend.XendDomain')
- xd.domain_unpause(xd.domain_create(config).getDomid())
+ new_dom = xd.domain_create(config)
+ try:
+ xc.domain_unpause(new_dom.getDomid())
+ except:
+ new_dom.destroy()
+ raise
except Exception, exn:
log.exception('Failed to restart domain %d.', self.domid)
finally:
# self.exportToDB()
- def preserveShutdownDomain(self):
+ def preserveForRestart(self):
"""Preserve a domain that has been shut down, by giving it a new UUID,
cloning the VM details, and giving it a new name. This allows us to
keep this domain for debugging, but restart a new one in its place
self.uuid = new_uuid
self.vmpath = VMROOT + new_uuid
self.storeVmDetails()
- self.storeDom('vm', self.vmpath)
- self.storeDom('xend/shutdown', 'True')
+ self.preserve()
+
+
+ def preserve(self):
+     """Keep this dead domain around instead of destroying it.
+
+     Logs the decision, records the 'xend/shutdown_completed' marker via
+     storeDom (the shutdown handling reads that key with readDom and
+     leaves an already-seen domain alone), and moves the state to
+     STATE_VM_TERMINATED.
+     """
+     log.info("Preserving dead domain %s (%d).", self.info['name'],
+              self.domid)
+     self.storeDom('xend/shutdown_completed', 'True')
+     # state_set is the setter used by every other state change in this
+     # class.
+     self.state_set(STATE_VM_TERMINATED)
def generateShutdownName(self):